#!/bin/bash
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Stop at the first failing command and treat any reference to an unset
# variable as an error.
set -o errexit -o nounset

# Load the shared helper library. The tokenize function called later in this
# script is not defined in this file, so it is presumably provided here, along
# with ROOT, SRC, TGT and TRAIN_CLEANER -- TODO confirm against wmt14_lib.sh.
source wmt14_lib.sh

# Make sure the directory that receives the tokenized training data exists.
mkdir -p "${ROOT}/tokenized/train"

#######################################
# Runs the training-corpus cleaner over one tokenized corpus, once per
# language direction.
# Globals:
#   ROOT          (read) root directory holding the tokenized/train tree.
#   TRAIN_CLEANER (read) command invoked to do the cleaning.
#   SRC, TGT      (read) the two language identifiers passed to the cleaner.
# Arguments:
#   $1 - corpus base name; input is read from
#        ${ROOT}/tokenized/train/<base> and the cleaned output prefix is
#        ${ROOT}/tokenized/train/<base>.clean
# Returns:
#   The exit status of the last cleaner invocation.
#######################################
function clean_train {
  local base=$1
  local output_base="${ROOT}/tokenized/train/${base}"

  # Requires both directions to run and will output both.
  # The trailing "1 100" is forwarded verbatim to the cleaner; presumably the
  # minimum and maximum accepted sentence length -- TODO confirm against the
  # cleaner's usage text.
  # SRC and TGT are quoted so that each is always passed as exactly one
  # argument, even if the value is empty or contains whitespace or glob
  # characters.
  "${TRAIN_CLEANER}" "${output_base}" "${SRC}" "${TGT}" \
    "${output_base}.clean" 1 100
  "${TRAIN_CLEANER}" "${output_base}" "${TGT}" "${SRC}" \
    "${output_base}.clean" 1 100
}

# Training corpora to process, one entry per corpus, written as
# "<input path prefix>:<output corpus name>". The language suffix (.de / .en)
# is appended to the prefix when tokenizing.
train_corpora=(
  "commoncrawl.de-en:commoncrawl"
  "training/europarl-v7.de-en:europarl-v7"
  "training/news-commentary-v9.de-en:news-commentary-v9"
)

# Tokenize the German and then the English side of every corpus, in the order
# listed above.
# TODO(drpng): eliminate en/de patterns.
for corpus_entry in "${train_corpora[@]}"; do
  corpus_prefix="${corpus_entry%%:*}"
  corpus_name="${corpus_entry##*:}"
  for corpus_lang in de en; do
    tokenize "${corpus_prefix}.${corpus_lang}" "${corpus_lang}" train \
      "${corpus_name}"
  done
done

# Clean the corpora only after every one of them has been tokenized.
for corpus_entry in "${train_corpora[@]}"; do
  clean_train "${corpus_entry##*:}"
done
